Skip to content

Conversation

@gaurav
Copy link
Collaborator

@gaurav gaurav commented Aug 28, 2025

We have several requests/needs for files containing CURIEs to be normalized in bulk. There are multiple ways of doing this using NodeNorm, but it would be nice to have a tool that does it as an INNER JOIN against the combined DuckDB database we create while building NodeNorm, as that should be much faster than the other approaches. We could also use this to export every mapping we have from a particular source (see https://github.com/TranslatorSRI/NodeNormalization/issues/321).

WIP. Should be merged after PR #495.

gaurav and others added 30 commits August 2, 2025 21:34
diff --git c/src/babel_utils.py i/src/babel_utils.py
index a96120d..5cbab9c 100644
--- c/src/babel_utils.py
+++ i/src/babel_utils.py
@@ -5,13 +5,15 @@ from enum import Enum
 from ftplib import FTP
 from io import BytesIO
 import gzip
-from datetime import datetime as dt
+from datetime import datetime as dt, datetime
 from datetime import timedelta
 import time
 import requests
 import os
 import urllib
 import jsonlines
+import yaml
+
 from src.node import NodeFactory, SynonymFactory, DescriptionFactory, InformationContentFactory, TaxonFactory
 from src.util import Text, get_config
 from src.LabeledID import LabeledID
@@ -349,10 +351,11 @@ def get_numerical_curie_suffix(curie):
     return None

-def write_compendium(synonym_list,ofname,node_type,labels={},extra_prefixes=[],icrdf_filename=None):
+def write_compendium(metadata_yamls, synonym_list,ofname,node_type,labels={},extra_prefixes=[],icrdf_filename=None):
     """
+    :param metadata_yaml: The YAML files containing the metadata for this compendium.
     :param synonym_list:
-    :param ofname:
+    :param ofname: Output filename. A file with this filename will be created in both the `compendia` and `synonyms` output directories.
     :param node_type:
     :param labels: A map of identifiers
         Not needed if each identifier will have a label in the correct directory (i.e. downloads/PMID/labels for PMID:xxx).
@@ -371,6 +374,32 @@ def write_compendium(synonym_list,ofname,node_type,labels={},extra_prefixes=[],i
     node_factory = NodeFactory(make_local_name(''),biolink_version)
     synonym_factory = SynonymFactory(make_local_name(''))

+    # Write out the metadata.yaml file combining information from all the metadata.yaml files.
+    metadata_dir = os.path.join(cdir,'metadata')
+    os.makedirs(metadata_dir, exist_ok=True)
+    with open(os.path.join(cdir, ofname + '.yaml'), 'w') as outf:
+        metadata = {
+            'type': 'compendium',
+            'name': ofname,
+            'created_at': datetime.now().isoformat(),
+            'concords': {}
+        }
+        for metadata_yaml in metadata_yamls:
+            metadata_block = yaml.safe_load(metadata_yaml)
+            if metadata_block is None:
+                raise ValueError("Metadata file {metadata_yaml} is empty.")
+
+            metadata_name = metadata_block['name']
+
+            if metadata_name in metadata['concords']:
+                logging.error(f"Duplicate metadata block name {metadata_name}!")
+                logging.error(f"New metadata block from {metadata_yaml}: {metadata_block}!")
+                logging.error(f"Existing metadata block: {metadata['concords'][metadata_name]}!")
+                raise ValueError(f"Metadata file {metadata_yaml} is named {metadata_name}, but this has already been loaded.")
+            metadata['concords'][metadata_name] = metadata_block
+
+        outf.write(yaml.dump(metadata))
+
     # Load the preferred_name_boost_prefixes -- this tells us which prefixes to boost when
     # coming up with a preferred label for a particular Biolink class.
     preferred_name_boost_prefixes = config['preferred_name_boost_prefixes']
diff --git c/src/babel_utils.py i/src/babel_utils.py
index f973337..59a5360 100644
--- c/src/babel_utils.py
+++ i/src/babel_utils.py
@@ -14,6 +14,7 @@ import urllib
 import jsonlines
 import yaml

+from src.metadata.provenance import write_combined_metadata
 from src.node import NodeFactory, SynonymFactory, DescriptionFactory, InformationContentFactory, TaxonFactory
 from src.util import Text, get_config
 from src.LabeledID import LabeledID
@@ -559,44 +560,17 @@ def write_compendium(metadata_yamls, synonym_list, ofname, node_type, labels={},
                     exit()

     # Write out the metadata.yaml file combining information from all the metadata.yaml files.
-    metadata_dir = os.path.join(cdir,'metadata')
-    os.makedirs(metadata_dir, exist_ok=True)
-    with open(os.path.join(cdir, 'metadata', ofname + '.yaml'), 'w') as outf:
-        # TODO: move into metadata/provenance.py
-        metadata = {
-            'type': 'compendium',
-            'name': ofname,
-            'created_at': datetime.now().isoformat(),
-            'counts': {
-                'cliques': count_cliques,
-                'eq_ids': count_eq_ids,
-                'synonyms': count_synonyms,
-            },
-            'concords': {}
-        }
-        for metadata_yaml in metadata_yamls:
-            with open(metadata_yaml, 'r') as metaf:
-                metadata_block = yaml.safe_load(metaf)
-                if metadata_block is None or metadata_block == {}:
-                    raise ValueError("Metadata file {metadata_yaml} is empty.")
-
-                if 'name' not in metadata_block:
-                    raise ValueError(f"Metadata file {metadata_yaml} is missing a 'name' field: {metadata_block}")
-
-                metadata_name = metadata_block['name']
-
-                if type(metadata_name) != str:
-                    raise ValueError(f"Metadata file {metadata_yaml} has a 'name' field that is not a string: {metadata_block}")
-
-                if metadata_name in metadata['concords']:
-                    # If it's not already a list, then make it into a list.
-                    if type(metadata['concords'][metadata_name]) != list:
-                        metadata['concords'][metadata_name] = [metadata['concords'][metadata_name]]
-                    metadata['concords'][metadata_name].append(metadata_block)
-                else:
-                    metadata['concords'][metadata_name] = metadata_block
-
-        yaml.dump(metadata, outf)
+    write_combined_metadata(
+        os.path.join(cdir, 'metadata', ofname + '.yaml'),
+        typ='compendium',
+        name=ofname,
+        counts={
+            'cliques': count_cliques,
+            'eq_ids': count_eq_ids,
+            'synonyms': count_synonyms,
+        },
+        combined_from_filenames=metadata_yamls,
+    )

 def glom(conc_set, newgroups, unique_prefixes=['INCHIKEY'],pref='HP',close={}):
     """We want to construct sets containing equivalent identifiers.
diff --git c/src/createcompendia/drugchemical.py i/src/createcompendia/drugchemical.py
index 2de4804..8dee460 100644
--- c/src/createcompendia/drugchemical.py
+++ i/src/createcompendia/drugchemical.py
@@ -1,5 +1,6 @@
 import csv

+from src.metadata.provenance import write_combined_metadata, write_concord_metadata
 from src.node import NodeFactory, InformationContentFactory
 from src.prefixes import RXCUI, PUBCHEMCOMPOUND, UMLS
 from src.categories import (CHEMICAL_ENTITY, DRUG, MOLECULAR_MIXTURE, FOOD, COMPLEX_MOLECULAR_MIXTURE,
@@ -139,7 +140,7 @@ def get_cui(x,indicator_column,cui_column,aui_column,aui_to_cui,sdui_to_cui):
         print(x)
         exit()

-def build_rxnorm_relationships(conso, relfile, outfile):
+def build_rxnorm_relationships(conso, relfile, outfile, metadata_yaml):
     """RXNREL is a lousy file.
     The subject and object can sometimes be a CUI and sometimes an AUI and you have to use
     CONSO to figure out how to go back and forth.
@@ -167,8 +168,32 @@ def build_rxnorm_relationships(conso, relfile, outfile):
     #This is maybe relying on convention a bit too much.
     if outfile == "UMLS":
         prefix = UMLS
+        sources = [
+            {
+                'type': 'UMLS',
+                'name': 'MRCONSO',
+                'filename': conso
+            },
+            {
+                'type': 'UMLS',
+                'name': 'MRREL',
+                'filename': relfile
+            }
+        ]
     else:
         prefix = RXCUI
+        sources = [
+            {
+                'type': 'RXNORM',
+                'name': 'RXNCONSO',
+                'filename': conso
+            },
+            {
+                'type': 'RXNOM',
+                'name': 'RXNREL',
+                'filename': relfile
+            }
+        ]
     aui_to_cui, sdui_to_cui = get_aui_to_cui(conso)
     # relfile = os.path.join('input_data', 'private', "RXNREL.RRF")
     single_use_relations = {"has_active_ingredient": defaultdict(set),
@@ -214,6 +239,13 @@ def build_rxnorm_relationships(conso, relfile, outfile):
                     continue
                 outf.write(f"{prefix}:{subject}\t{predicate}\t{prefix}:{next(iter(objects))}\n")

+    write_concord_metadata(
+        metadata_yaml,
+        name='build_rxnorm_relationships()',
+        description=f'Builds relationships between RxCUI and other identifiers from a CONSO ({conso}) and a REL ({relfile}).',
+        sources=sources
+    )
+

 def load_cliques(compendium):
     rx_to_clique = {}
@@ -228,7 +260,7 @@ def load_cliques(compendium):
                    rx_to_clique[terms["i"]] = clique
     return rx_to_clique

-def build_pubchem_relationships(infile,outfile):
+def build_pubchem_relationships(infile,outfile, metadata_yaml):
     with open(infile,"r") as inf:
         document = json.load(inf)
     with open(outfile,"w") as outf:
@@ -238,7 +270,19 @@ def build_pubchem_relationships(infile,outfile):
             for cid in cids:
                 outf.write(f"{RXCUI}:{rxnid}\tlinked\t{PUBCHEMCOMPOUND}:{cid}\n")

-def build_conflation(manual_concord_filename, rxn_concord, umls_concord, pubchem_rxn_concord, drug_compendium, chemical_compendia, icrdf_filename, outfilename):
+    write_concord_metadata(
+        metadata_yaml,
+        name='build_pubchem_relationships()',
+        description=f'Builds relationships between RxCUI and PubChem Compound identifiers from a PubChem annotations file ({infile}.',
+        sources=[{
+            'type': 'PubChem',
+            'name': 'PubChem RxNorm annotations',
+            'description': 'PubChem RxNorm mappings generated by pubchem.pull_rxnorm_annotations()',
+            'filename': infile
+        }]
+    )
+
+def build_conflation(manual_concord_filename, rxn_concord, umls_concord, pubchem_rxn_concord, drug_compendium, chemical_compendia, icrdf_filename, outfilename, input_metadata_yamls, output_metadata_yaml):
     """RXN_concord contains relationshps between rxcuis that can be used to conflate
     Now we don't want all of them.  We want the ones that are between drugs and chemicals,
     and the ones between drugs and drugs.
@@ -556,6 +600,15 @@ def build_conflation(manual_concord_filename, rxn_concord, umls_concord, pubchem
             outfile.write(f"{json.dumps(final_conflation_id_list)}\n")
             written.add(fs)

+    # Write out metadata.yaml
+    write_combined_metadata(
+        output_metadata_yaml,
+        typ='conflation',
+        name='drugchemical.build_conflation()',
+        description='Build DrugChemical conflation.',
+        combined_from_filenames=input_metadata_yamls
+    )
+

 def sort_by_curie_suffix(curie):
     """
diff --git c/src/metadata/provenance.py i/src/metadata/provenance.py
index 54bc50e..5a8f703 100644
--- c/src/metadata/provenance.py
+++ i/src/metadata/provenance.py
@@ -1,3 +1,4 @@
+import os.path
 from datetime import datetime

 import yaml
@@ -8,13 +9,56 @@ def write_download_metadata(filename, name, url='', description='', sources=None
 def write_concord_metadata(filename, name, url='', description='', sources=None, counts=None):
     write_metadata(filename, 'concord', name, url=url, description=description, sources=sources, counts=None)

-def write_metadata(filename, typ, name, sources=None, url='', description='', counts=None):
-    if type(name) != str:
+def write_combined_metadata(filename, typ, name, sources=None, url='', description='', counts=None, combined_from_filenames=None):
+    combined_from = {}
+    if combined_from_filenames is not None:
+        for metadata_yaml in combined_from_filenames:
+            with open(metadata_yaml, 'r') as metaf:
+                metadata_block = yaml.safe_load(metaf)
+                if metadata_block is None or metadata_block == {}:
+                    raise ValueError("Metadata file {metadata_yaml} is empty.")
+
+                if 'name' not in metadata_block:
+                    raise ValueError(f"Metadata file {metadata_yaml} is missing a 'name' field: {metadata_block}")
+
+                metadata_name = metadata_block['name']
+
+                if type(metadata_name) is not str:
+                    raise ValueError(f"Metadata file {metadata_yaml} has a 'name' field that is not a string: {metadata_block}")
+
+                if metadata_name in combined_from:
+                    # If it's not already a list, then make it into a list.
+                    if type(combined_from[metadata_name]) is not list:
+                        combined_from[metadata_name] = [combined_from[metadata_name]]
+                    combined_from[metadata_name].append(metadata_block)
+                else:
+                    combined_from[metadata_name] = metadata_block
+
+    write_metadata(
+        filename,
+        typ=typ,
+        name=name,
+        sources=sources,
+        url=url,
+        description=description,
+        counts=counts,
+        combined_from=combined_from
+    )
+
+def write_metadata(filename, typ, name, sources=None, url='', description='', counts=None, combined_from=None):
+    if type(typ) is not str:
+        raise ValueError(f"Metadata entry type must be a string, not {type(typ)}: '{typ}'")
+    if type(name) is not str:
         raise ValueError(f"Metadata entry name must be a string, not {type(name)}: '{name}'")
     if sources is None:
         sources = []
     if counts is None:
         counts = []
+    if combined_from is None:
+        combined_from = []
+
+    metadata_dir = os.path.dirname(filename)
+    os.makedirs(metadata_dir, exist_ok=True)
     with open(filename, 'w') as fout:
         yaml.dump({
             'created_at': datetime.now().isoformat(),
@@ -24,4 +68,5 @@ def write_metadata(filename, typ, name, sources=None, url='', description='', co
             'description': description,
             'sources': sources,
             'counts': counts,
+            'combined_from': combined_from,
         }, fout)
diff --git c/src/snakefiles/drugchemical.snakefile i/src/snakefiles/drugchemical.snakefile
index 9640c13..3f6a8d3 100644
--- c/src/snakefiles/drugchemical.snakefile
+++ i/src/snakefiles/drugchemical.snakefile
@@ -1,6 +1,7 @@
 import src.createcompendia.drugchemical as drugchemical
 import src.synonyms.synonymconflation as synonymconflation
 import src.snakefiles.util as util
+from src.metadata.provenance import write_concord_metadata

 ### Drug / Chemical

@@ -9,39 +10,56 @@ rule rxnorm_relationships:
         rxnconso = config['download_directory'] + "/RxNorm/RXNCONSO.RRF",
         rxnrel = config['download_directory'] + "/RxNorm/RXNREL.RRF",
     output:
-        outfile_concords = config['intermediate_directory'] + '/drugchemical/concords/RXNORM'
+        outfile_concords = config['intermediate_directory'] + '/drugchemical/concords/RXNORM',
+        metadata_yaml = config['intermediate_directory'] + '/drugchemical/concords/metadata-RXNORM.yaml'
     run:
-        drugchemical.build_rxnorm_relationships(input.rxnconso, input.rxnrel, output.outfile_concords)
+        drugchemical.build_rxnorm_relationships(input.rxnconso, input.rxnrel, output.outfile_concords, output.metadata_yaml)

 rule umls_relationships:
     input:
         umlsconso = config['download_directory'] + "/UMLS/MRCONSO.RRF",
         umlsrel = config['download_directory'] + "/UMLS/MRREL.RRF",
     output:
-        outfile_concords = config['intermediate_directory'] + '/drugchemical/concords/UMLS'
+        outfile_concords = config['intermediate_directory'] + '/drugchemical/concords/UMLS',
+        metadata_yaml = config['intermediate_directory'] + '/drugchemical/concords/metadata-UMLS.yaml'
     run:
-        drugchemical.build_rxnorm_relationships(input.umlsconso, input.umlsrel, output.outfile_concords)
+        drugchemical.build_rxnorm_relationships(input.umlsconso, input.umlsrel, output.outfile_concords, output.metadata_yaml)

 rule pubchem_rxnorm_relationships:
     input:
         infile = config['download_directory'] + '/PUBCHEM.COMPOUND/RXNORM.json',
     output:
-        outfile_concords = config['intermediate_directory'] + '/drugchemical/concords/PUBCHEM_RXNORM'
+        outfile_concords = config['intermediate_directory'] + '/drugchemical/concords/PUBCHEM_RXNORM',
+        metadata_yaml = config['intermediate_directory'] + '/drugchemical/concords/metadata-PUBCHEM_RXNORM.yaml'
     run:
-        drugchemical.build_pubchem_relationships(input.infile,output.outfile_concords)
+        drugchemical.build_pubchem_relationships(input.infile,output.outfile_concords, output.metadata_yaml)

 rule drugchemical_conflation:
     input:
         drug_compendium=config['output_directory']+'/compendia/'+'Drug.txt',
         chemical_compendia=expand("{do}/compendia/{co}", do=config['output_directory'], co=config['chemical_outputs']),
         rxnorm_concord=config['intermediate_directory']+'/drugchemical/concords/RXNORM',
+        rxnorm_metadata=config['intermediate_directory']+'/drugchemical/concords/metadata-RXNORM.yaml',
         umls_concord=config['intermediate_directory']+'/drugchemical/concords/UMLS',
+        umls_metadata=config['intermediate_directory']+'/drugchemical/concords/metadata-UMLS.yaml',
         pubchem_concord=config['intermediate_directory']+'/drugchemical/concords/PUBCHEM_RXNORM',
+        pubchem_metadata=config['intermediate_directory']+'/drugchemical/concords/metadata-PUBCHEM_RXNORM.yaml',
         drugchemical_manual_concord=config['input_directory']+'/manual_concords/drugchemical.tsv',
         icrdf_filename=config['download_directory']+'/icRDF.tsv',
     output:
-        outfile=config['output_directory']+'/conflation/DrugChemical.txt'
+        outfile=config['output_directory']+'/conflation/DrugChemical.txt',
+        metadata_yaml=config['output_directory']+'/conflation/metadata.yaml',
+        drugchemical_manual_metadata=config['intermediate_directory']+'/drugchemical/concords/metadata-Manual.yaml',
     run:
+        write_concord_metadata(input.drugchemical_manual_metadata,
+            name='Manual DrugChemical Concords',
+            description='Manually curated DrugChemical conflation cross-references from the Babel repository',
+            sources=[{
+                'name': 'Babel repository',
+                'url': 'https://github.com/TranslatorSRI/Babel',
+            }],
+            url='https://github.com/TranslatorSRI/Babel/blob/master/input_data/manual_concords/drugchemical.tsv',
+        )
         drugchemical.build_conflation(
             input.drugchemical_manual_concord,
             input.rxnorm_concord,
@@ -50,7 +68,13 @@ rule drugchemical_conflation:
             input.drug_compendium,
             input.chemical_compendia,
             input.icrdf_filename,
-            output.outfile)
+            output.outfile,
+            input_metadata_yamls={
+                'RXNORM': input.rxnorm_metadata,
+                'UMLS': input.umls_metadata,
+                'PUBCHEM_RXNORM': input.pubchem_metadata,
+                'Manual': input.drugchemical_manual_metadata,
+            }, output_metadata_yaml=output.metadata_yaml)

 rule drugchemical_conflated_synonyms:
     input:
gaurav added 30 commits August 17, 2025 18:06
Since MeSH is not an ids file for proteins, this should only pull in
MeSH IDs that are associated with a UMLS ID.
Could also be useful to track memory in the future.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Labels

None yet

Projects

None yet

Development

Successfully merging this pull request may close these issues.

2 participants